###############################################
#CLEANING RAW DATA
###############################################

#load packages

library(dplyr)
library(psych)

####LOAD DATA####

# Read the raw survey export (path is relative to the current working directory)
data <- read.csv(file = "./data/data_raw.csv")

####SCREENING####

# Drop participants who failed either attention check.
# Each flag is 1 when the answer differs from the expected one, 0 when it
# matches, and NA when the answer itself is NA.
data$failed_1 <- as.numeric(data$Q15 != "Strongly disagree")
data$failed_2 <- as.numeric(data$attention_check_alt != "1880")

# Keep only rows that passed both checks; which() discards rows whose flag is
# NA, so those rows are removed as well.
data <- data[which(data$failed_1 == 0 & data$failed_2 == 0), , drop = FALSE]


# Screen for respondents who completed the survey too quickly, i.e. whose
# duration is more than 3 SDs below the mean duration (original note: there
# are none). Only the lower tail is checked.
data$duration <- as.numeric(data$Duration..in.seconds.)

# Lower cutoff, computed once. TRUE is spelled out because T is an ordinary
# variable that can be reassigned.
duration_cutoff <- mean(data$duration, na.rm = TRUE) -
  3 * sd(data$duration, na.rm = TRUE)

data$tooshort <- ifelse(data$duration < duration_cutoff, 1, 0)

# subset() also drops rows where tooshort is NA (i.e. duration is NA).
data <- subset(data, tooshort == 0)

####CODING MAIN VARIABLES####

# Code treatment status from the randomisation indicator:
# rand 1 -> "control", 2 -> "brit", 3 -> "belg"; any other value (or NA) -> NA.
data$trt <- c("control", "brit", "belg")[match(data$rand, c(1, 2, 3))]
data$trt <- factor(data$trt, levels = c("control", "brit", "belg"))

#clean outcome variables

# Donation outcome.
# as.numeric() turns blank (and any non-numeric) entries into NA; those are
# then treated as a donation of 0. The former `== ""` comparison was removed:
# it ran after the numeric conversion, so it could never match anything.
data$donation_amount <- as.numeric(data$lottery_1_TEXT)
data$donation_amount[is.na(data$donation_amount)] <- 0

# 1 if any positive amount was donated, 0 otherwise
data$donation_dummy <- ifelse(data$donation_amount > 0, 1, 0)

# Perception of empire.
# NOTE(review): blank responses are recoded to 0 here rather than NA, so they
# are indistinguishable from a genuine answer of 0 - confirm this is intended.
data$empire_1 <- ifelse(data$empire_1 == "", 0, data$empire_1)

# Strip the textual anchors attached to the scale end-points, then convert to
# numeric. fixed = TRUE matches the labels literally (the parentheses are not
# regex groups); TRUE is used instead of the reassignable shorthand T.
data$empire_1 <- as.numeric(
  gsub(
    " (very positive)", "",
    gsub(" (very negative)", "", data$empire_1, fixed = TRUE),
    fixed = TRUE
  )
)

# Explicit prejudice: strip the end-point anchors and convert to numeric.
# fixed = TRUE matches the labels literally; TRUE replaces the reassignable T.
data$prej_direct_1 <- as.numeric(
  gsub(
    " (not at all)", "",
    gsub(" (very much)", "", data$prej_direct_1, fixed = TRUE),
    fixed = TRUE
  )
)

# National identification outcomes (3 items -> standardised PCA).
# For each item, strip the end-point anchors and convert to numeric.
# fixed = TRUE matches the labels literally; TRUE replaces the reassignable T.
data$Natid3_1 <- as.numeric(
  gsub(
    " (not at all close)", "",
    gsub(" (very close)", "", data$Natid3_1, fixed = TRUE),
    fixed = TRUE
  )
)
data$Natid2_1 <- as.numeric(
  gsub(
    " (never)", "",
    gsub(" (always)", "", data$Natid2_1, fixed = TRUE),
    fixed = TRUE
  )
)
data$Natid1_1 <- as.numeric(
  gsub(
    " (strongly disagree)", "",
    gsub(" (strongly agree)", "", data$Natid1_1, fixed = TRUE),
    fixed = TRUE
  )
)

# PCA of the three national-identification items.
pca_vars <- select(data, Natid1_1, Natid2_1, Natid3_1)

# Rows with a non-missing value on all three items; only these enter the PCA
complete_rows <- complete.cases(pca_vars)

results <- prcomp(
  pca_vars[complete_rows, , drop = FALSE],
  center = TRUE,
  scale. = TRUE
)

# First principal component score; rows with any missing item stay NA
data$natid_pca <- NA
data$natid_pca[complete_rows] <- results$x[, 1]

# Standardise the PCA score against the control group: subtract the control
# mean and divide by the control SD (NAs ignored when computing both).
control_mean <- with(data, mean(natid_pca[trt == "control"], na.rm = TRUE))
control_sd <- with(data, sd(natid_pca[trt == "control"], na.rm = TRUE))
data$natid_pca <- (data$natid_pca - control_mean) / control_sd

# Whataboutism: strip the end-point anchors and convert to numeric.
# fixed = TRUE matches the labels literally; TRUE replaces the reassignable T.
data$comparison_1 <- as.numeric(
  gsub(
    " (strongly disagree)", "",
    gsub(" (strongly agree)", "", data$comparison_1, fixed = TRUE),
    fixed = TRUE
  )
)

# Implicit prejudice (list items).
# rand_list selects which list column supplies the item count:
# 1 -> prej_list_nonsen, 2 -> prej_list_sen, anything else -> NA.
data$list_items <- as.numeric(with(
  data,
  ifelse(
    rand_list == 1, prej_list_nonsen,
    ifelse(rand_list == 2, prej_list_sen, NA)
  )
))

# Indicator: 0 when rand_list is 1, 1 when rand_list is 2, NA otherwise
data$sens <- with(
  data,
  ifelse(rand_list == 1, 0, ifelse(rand_list == 2, 1, NA))
)


####CODING COVARIATES####

# Pre-treatment national identification (2 items -> PCA).
# For each item, strip the end-point anchors and convert to numeric.
# fixed = TRUE matches the labels literally; TRUE replaces the reassignable T.
data$natid_covar1_1 <- as.numeric(
  gsub(
    " (not at all)", "",
    gsub(" (very much)", "", data$natid_covar1_1, fixed = TRUE),
    fixed = TRUE
  )
)
data$natid_covar2_1 <- as.numeric(
  gsub(
    " (not at all attached)", "",
    gsub(" (very attached)", "", data$natid_covar2_1, fixed = TRUE),
    fixed = TRUE
  )
)

# One-component PCA of the two pre-treatment identification items.
pca2 <- dplyr::select(data, natid_covar1_1, natid_covar2_1)

# Replace any infinite values with NA before fitting
pca2[] <- lapply(pca2, function(item) replace(item, is.infinite(item), NA))

PCA_results2 <- psych::principal(
  pca2,
  nfactors = 1,
  scores = TRUE,
  missing = TRUE
)
data$natid_covar_pca <- as.numeric(PCA_results2$scores)

# Split the component score into 5 ranked groups (quintiles) for HTE analysis
data$nat_quint <- ntile(data$natid_covar_pca, 5)


# Left/right self-placement: strip the end-point anchors and convert to
# numeric. fixed = TRUE matches the labels literally; TRUE replaces the
# reassignable T.
data$lr_1 <- as.numeric(
  gsub(
    " (right)", "",
    gsub(" (left)", "", data$lr_1, fixed = TRUE),
    fixed = TRUE
  )
)

# Household income: map each income-band label to its rank (1 = lowest band,
# 13 = highest). Labels not listed here are coded NA.
income_map <- setNames(
  as.numeric(1:13),
  c(
    "Less than £10,000",
    "£10,000 - £15,999",
    "£16,000 - £19,999",
    "£20,000 - £29,999",
    "£30,000 - £39,999",
    "£40,000 - £49,999",
    "£50,000 - £59,999",
    "£60,000 - £69,999",
    "£70,000 - £79,999",
    "£80,000 - £89,999",
    "£90,000 - £99,999",
    "£100,000 - £149,999",
    "More than £150,000"
  )
)

data$income <- recode(
  data$Household.Income..GBP.,
  !!!income_map,
  .default = NA_real_
)


# Education: numeric level per qualification label, then a low/high split.
# NOTE(review): "Technical/community college" is coded below "Secondary
# education" - confirm this ordering is intended. It does not affect the
# low/high split, since both fall at or below 3.
edu_level_map <- setNames(
  c(0, 1, 1, 2, 3, 4, 5, 6),
  c(
    "No formal qualifications",
    "Technical/community college",
    "Technical/community college, Technical/community college",
    "Secondary education (e.g. GED/GCSE)",
    "High school diploma/A-levels",
    "Undergraduate degree (BA/BSc/other)",
    "Graduate degree (MA/MSc/MPhil/other)",
    "Doctorate degree (PhD/other)"
  )
)

# Labels not listed in the map are coded NA
data$education_level <- recode(
  data$Highest.education.level.completed,
  !!!edu_level_map,
  .default = NA_real_
)

# "low" for levels up to 3, "high" for levels 4 and above, NA otherwise
data$education <- if_else(
  data$education_level <= 3,
  "low",
  if_else(data$education_level >= 4, "high", NA_character_)
)

# Employment status: one 0/1 indicator per category (NA where status is NA).
# The category labels are matched exactly as they appear in the raw data.
data$newjob <- as.numeric(
  data$Employment.Status == "Due to start a new job within the next month"
)
data$unemployed <- as.numeric(
  data$Employment.Status == "Unemployed (and job seeking)"
)
data$fulltime <- as.numeric(data$Employment.Status == "Full-Time")
data$notpaid <- as.numeric(
  data$Employment.Status == "Not in paid work (e.g. homemaker', 'retired or disabled)"
)
data$parttime <- as.numeric(data$Employment.Status == "Part-Time")

# Gender: 1 when Sex is "Female", 0 for any other value
data$female <- as.numeric(data$Sex == "Female")

# Place of birth and place of residence: 0/1 region indicators.
#
# The region label sets are defined once and shared by the birth and residence
# variables so the two codings cannot drift apart. This also removes the
# duplicated "London, England" test that appeared in both south-England
# conditions.

# Return 1 where `area` is one of `regions`, 0 where it is not, and NA where
# `area` itself is NA (the same result the chained `==` / `|` comparisons gave).
region_dummy <- function(area, regions) {
  dummy <- as.numeric(area %in% regions)
  dummy[is.na(area)] <- NA
  dummy
}

regions_s_eng <- c(
  "East of England (East Anglia, Bedfordshire and Hertfordshire, Essex)",
  "London, England",
  "South East, England (Berkshire, Buckinghamshire, and Oxfordshire, Surrey, Sussex, Kent, Hampshire and Isle of Wight)",
  "South West, England (Gloucestershire, Wiltshire and Bristol/Bath area, Dorset and Somerset, Cornwall and Isles of Scilly, Devon)"
)

# The run of spaces in the North East label is reproduced exactly as matched
# by the original comparison.
regions_n_eng <- c(
  "North East, England    (Tees Valley, Durham, Northumberland and Tyne and Wear)",
  "North West, England (Cumbria, Greater Manchester, Lancashire, Merseyside)",
  "Yorkshire and the Humber, England (East Riding, North Lincolnshire and Yorkshire)"
)

regions_midlands <- c(
  "East Midlands, England (Derbyshire and Nottinghamshire, Leicestershire, Rutland and Northamptonshire, Lincolnshire)",
  "West Midlands, England (Herefordshire, Worcestershire and Warwickshire, Shropshire and Staffordshire, West Midlands)"
)

#place of birth
data$birth_s_eng <- region_dummy(data$UK.area.of.birth, regions_s_eng)
data$birth_n_eng <- region_dummy(data$UK.area.of.birth, regions_n_eng)
data$birth_midlands <- region_dummy(data$UK.area.of.birth, regions_midlands)
data$birth_scotland <- region_dummy(data$UK.area.of.birth, "Scotland")
data$birth_wales <- region_dummy(data$UK.area.of.birth, "Wales")
data$birth_ni <- region_dummy(data$UK.area.of.birth, "Northern Ireland")

#place of residence
data$residence_s_eng <- region_dummy(data$Current.UK.area.of.residence, regions_s_eng)
data$residence_n_eng <- region_dummy(data$Current.UK.area.of.residence, regions_n_eng)
data$residence_midlands <- region_dummy(data$Current.UK.area.of.residence, regions_midlands)
data$residence_scotland <- region_dummy(data$Current.UK.area.of.residence, "Scotland")
data$residence_wales <- region_dummy(data$Current.UK.area.of.residence, "Wales")
data$residence_ni <- region_dummy(data$Current.UK.area.of.residence, "Northern Ireland")


#####CREATE SUBSETS FOR ANALYSIS####
# Three pairwise comparison samples, each excluding one arm. Rows whose trt is
# NA are dropped from all three (which() discards NA comparisons).
data1 <- data[which(data$trt != "belg"), , drop = FALSE]    # control vs brit
data2 <- data[which(data$trt != "control"), , drop = FALSE] # brit vs belg
data3 <- data[which(data$trt != "brit"), , drop = FALSE]    # control vs belg

# Recode trt to a 0/1 indicator within each subset:
# data1: 1 = brit; data2: 1 = belg; data3: 1 = belg
data1$trt <- as.numeric(data1$trt == "brit")
data2$trt <- as.numeric(data2$trt == "belg")
data3$trt <- as.numeric(data3$trt == "belg")

# Save the cleaned data and the three analysis subsets into ./data/.
# Explicit paths are used instead of setwd(): changing the working directory
# would break the relative "./data/data_raw.csv" read at the top of this script
# on any re-run in the same session. Row names are still written (write.csv
# default), so the output files keep the same columns as before.
write.csv(data, "./data/data_clean.csv")
write.csv(data1, "./data/data1.csv")
write.csv(data2, "./data/data2.csv")
write.csv(data3, "./data/data3.csv")

